library(readxl)

# Import cell range O11:T486 of the 'correlation_regression' sheet.
dt <- read_excel(
  path = "DataSets.xlsx",
  sheet = "correlation_regression",
  range = "O11:T486"
)

# Column types as imported, before any recoding.
str(dt)

# Fold code 2 into code 1 (presumably leaving a two-level outcome --
# TODO confirm against the codebook). which() skips NA comparisons, so
# missing values stay missing.
dt[["Breast_feeding"]] <- replace(
  dt[["Breast_feeding"]],
  which(dt[["Breast_feeding"]] == 2),
  1
)

# Store the categorical columns as factors so glm() treats them as such.
dt[["Sex"]] <- as.factor(dt[["Sex"]])
dt[["Breast_feeding"]] <- as.factor(dt[["Breast_feeding"]])

# Column types and per-column summaries after the conversion.
str(dt)
summary(dt)

# Logistic regression model: Breast_feeding regressed on Sex, Age_mothers,
# Height and Weight, binomial family with a logit link.
library(jtools)

logmodel <- glm(
  formula = Breast_feeding ~ Sex + Age_mothers + Height + Weight,
  family = binomial(link = logit),
  data = dt
)

# jtools summary table of the fitted model.
summ(logmodel)

# Converting coefficients to probabilities via the odds (odds = p/(1-p)):
# p = odds/(1+odds)
# odds = exp(log(odds))
# glm() reports the coefficients on the log(odds) scale

# Coefficients of the fitted model, on the log-odds scale.
cf <- coef(logmodel)

# Back-transform to the odds scale.
odds <- exp(cf)

# p = odds / (1 + odds), computed with plogis() (the logistic CDF) so that a
# very large coefficient gives 1 instead of NaN (exp() overflows to Inf and
# Inf / (1 + Inf) is NaN).
# NOTE: only the intercept row is a genuine probability (the outcome
# probability when every predictor is at its reference level / zero). For the
# slope rows exp(cf) is an odds ratio, so the "prob" value there is not the
# probability of the outcome.
prob <- plogis(cf)

# Side-by-side table: log-odds, odds and the logistic transform.
alt <- data.frame(cf, odds, prob)
alt

# Exercise: How can we run a logistic regression when the dependent variable is continuous?
# We need to convert the continuous variable into a dummy (binary) variable, usually by splitting it at the median value.

# Read the salary data set (sheet 'project', cell range i8:o405).
sal <- read_excel(
  path = "DataSets.xlsx",
  sheet = "project",
  range = "i8:o405"
)

# Column types as imported.
str(sal)

# Store the categorical predictors as factors.
sal[["rank"]] <- as.factor(sal[["rank"]])
sal[["discipline"]] <- as.factor(sal[["discipline"]])
sal[["sex"]] <- as.factor(sal[["sex"]])

# Column types after the conversion.
str(sal)

# Process the dependent variable: dichotomise salary at its median.
# na.rm = TRUE keeps a single missing salary from turning the cut-off (and
# with it every salcat value) into NA; a row whose salary is missing still
# gets salcat = NA.
m <- median(sal$salary, na.rm = TRUE)

# new variable salcat: 1 = salary strictly above the median, 0 = otherwise
sal$salcat <- ifelse(sal$salary > m, 1, 0)

# Class counts, base R version.
summary(as.factor(sal$salcat))

# The same class counts written as a pipeline.
library(tidyverse)
sal %>%
  pull(salcat) %>%
  as.factor() %>%
  summary()


# Logistic regression of the dichotomised salary (salcat) on sex, rank,
# discipline, yrs.phd and yrs.service; binomial family, logit link.
salmod <- glm(
  formula = salcat ~ sex + rank + discipline + yrs.phd + yrs.service,
  family = binomial(link = "logit"),
  data = sal
)

# Report the fit twice: jtools table first, then the base R summary.
summ(salmod)
summary(salmod)

# predicted values
# Predicted probabilities on the response scale. Passing newdata = sal
# returns one value per row of sal (NA where a predictor is missing), so pv
# always lines up with ob and iv even if glm() dropped incomplete rows.
pv <- predict(salmod, newdata = sal, type = "response")
ob <- sal$salcat   # observed 0/1 outcome
iv <- sal$yrs.phd  # predictor shown on the x axis

pdt <- data.frame(pv, ob, iv)

library(ggplot2)
# Observed outcomes (grey) with the model's predicted probabilities (blue)
# drawn on top; previously pv was computed but never plotted.
ggplot(pdt, aes(x = iv)) +
  geom_point(aes(y = ob), colour = "grey40", alpha = 0.5) +
  geom_point(aes(y = pv), colour = "steelblue") +
  labs(x = "yrs.phd", y = "salcat (observed) / predicted probability")
